import pandas as pd

# Read the cleaned dataset from disk.
df = pd.read_csv('cleaned_with_male_head.csv')

# Display order of the grouped regions. Every count below is reindexed to
# this list, so a region label that is not listed here does not appear in
# the resulting series.
regions = [
    'Red River Delta',
    'Northern Midlands and Mountainous Area',
    'North Central & Central Coastal Area',
    'Central Highlands',
    'Mekong River Delta',
]

# Fold the finer-grained region labels into their grouped label; any label
# not named in the mapping is carried over unchanged.
_region_merges = {
    'Northern Midlands and Mountains': 'Northern Midlands and Mountainous Area',
    'Northwest': 'Northern Midlands and Mountainous Area',
    'North Central Coast': 'North Central & Central Coastal Area',
    'South Central Coast': 'North Central & Central Coastal Area',
}
df['region_grouped'] = df['region'].replace(_region_merges)

# Coerce htype to integers so the equality filters below compare against 0/1.
df['htype'] = df['htype'].astype(int)


def _rows_per_region(htype_value):
    """Count rows whose htype equals htype_value, per grouped region."""
    selected = df.loc[df['htype'].eq(htype_value)]
    per_region = selected.groupby('region_grouped').size()
    # Align to the fixed region order, filling absent regions with 0.
    return per_region.reindex(regions, fill_value=0)


# htype == 0: without grandparent(s); htype == 1: with grandparent(s).
without = _rows_per_region(0)
with_gp = _rows_per_region(1)
total_obs = without + with_gp

# Number of distinct household ids (code_id_hh) per grouped region.
total_hh = (
    df.groupby('region_grouped')['code_id_hh']
    .nunique()
    .reindex(regions, fill_value=0)
)

# Build Table 9: one unnamed label column followed by one column per region,
# with rows for the two htype counts, their sum, and the distinct-household
# count.
row_labels = [
    'Without grandparent(s)',
    'With grandparent(s)',
    'Total observations',
    'Total households',
]
table9_data = {'': row_labels}
for reg in regions:
    # Each series was reindexed to `regions`, so every label is present and
    # no lookup fallback is needed.
    table9_data[reg] = [without[reg], with_gp[reg], total_obs[reg], total_hh[reg]]

table9 = pd.DataFrame(table9_data)
print("\nTable 9: Household structure by region^a\n")
print(table9)

# Use a single path for both the write and the confirmation message so the
# reported filename always matches the file actually written.
output_path = 'revise_table_9_household_structure.csv'
table9.to_csv(output_path, index=False)
print(f"Saved {output_path}")
print("a The regions are grouped as per the paper.")